# ==============================================================================
# file name: 03-summarize-repeated-participation-YG.R
# date: Mar 12, 2024
# author: Bernhard Clemm 
# purpose: summarize attempts at repeated participation on person level (YouGov)
# THIS SCRIPT REQUIRES ACCESS TO THE RAW DATA AND SERVES FOR REFERENCE ONLY
# ==============================================================================

# DATA =========================================================================

# Load the raw repeated-participation records (path intentionally left blank,
# see header), expose the respondent identifier `caseid` under the name
# `person_id` as a character column, and tag every row with its data source.
repeated_YG <- read.csv("") %>% # repeated_participation_YG.csv
  rename(person_id = caseid) %>%
  mutate(across(person_id, as.character),
         dataset = "Yougov")

# MAIN =========================================================================

## indicators whether questionnaire URL was visited repeatedly ####

# For every visit, find the next visit by the same person to the same
# questionnaire URL, compute the time gap in minutes, and flag whether that
# next visit happened within 1, 6 or 24 hours. Also derive the survey platform
# from the visited URL.
repeated_YG <- repeated_YG %>%
  # Parse timestamps explicitly as UTC (as the column name indicates) so the
  # result does not depend on the time zone of the machine running the script;
  # local-time parsing can yield NA or shifted gaps around DST changes.
  mutate(created_utc = as.POSIXct(created_utc,
                                  format = "%Y-%m-%d %H:%M:%OS",
                                  tz = "UTC")) %>%
  arrange(person_id, url_quest, created_utc) %>%
  # by grouping by url_quest, we ensure that a "next" visit time is created
  # only for the same questionnaire URL
  group_by(person_id, url_quest) %>%
  mutate(created_utc_next = dplyr::lead(created_utc)) %>%
  ungroup() %>%
  mutate(timediff = difftime(created_utc_next, created_utc, units = "mins")) %>%
  # timediff is NA for the last visit of each person-URL pair (no next visit);
  # those rows are flagged FALSE. Thresholds are in minutes.
  mutate(
    same_1h = !is.na(timediff) & timediff < 60,
    same_6h = !is.na(timediff) & timediff < 60 * 6,
    same_24h = !is.na(timediff) & timediff < 60 * 24
  ) %>%
  mutate(domain = ada_get_domain(url),
         host = ada_get_host(url)) %>%
  # map known survey-software domains/hosts to a platform label;
  # anything not listed gets NA
  mutate(platform = case_when(
    domain == "confirmit.com" ~ "Confirmit",
    grepl("surveygizmo", host) ~ "Surveygizmo",
    domain == "surveymonkey.com" ~ "Surveymonkey",
    domain == "qualtrics.com" ~ "Qualtrics",
    domain == "cmix.com" ~ "Dynata",
    domain == "questionpro.com" ~ "Questionpro",
    domain == "formsite.com" ~ "Formsite",
    domain %in% c("unipark.com", "unipark.de") ~ "Unipark",
    domain == "typeform.com" ~ "Typeform",
    domain == "formstack.com" ~ "Formstack",
    grepl("zohopublic", host) ~ "Zoho"))

## person-level summary (1-hour cutoff) ####

# keep only visits that are not followed by another visit to the same
# questionnaire URL within 1 hour and whose `same_as_prev_url` flag is 0
# (flag comes from the raw data; presumably marks a direct repeat of the
# previous URL -- confirm against the raw-data documentation)
repeated_YG_1h <- repeated_YG %>%
  filter(!same_1h, same_as_prev_url == 0)

# person-URL level summary: number of retained visits per person and
# questionnaire URL; `repeated` is 1 if there is more than one, otherwise 0
repeated_YG_1h_url_people <- repeated_YG_1h %>%
  group_by(dataset, person_id, url_quest) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(repeated = as.numeric(count > 1))

## person-level summary
# One row per person: how many questionnaire URLs were visited, how many of
# them repeatedly, whether any was repeated, and the share repeated.
# `.groups = "drop"` returns an ungrouped table; a leftover grouping by
# `dataset` would make a later `select(-dataset)` silently keep that column.
repeated_YG_1h_people <- repeated_YG_1h_url_people %>%
  group_by(dataset, person_id) %>%
  summarise(count_urls_1h = n(), # number of surveys taken
            count_repeated_urls_1h = sum(repeated), # number of repeated surveys
            .groups = "drop") %>%
  mutate(any_repeated_1h = as.numeric(count_repeated_urls_1h > 0),
         prop_repeated_1h = count_repeated_urls_1h / count_urls_1h)

## person-level summary (6-hour cutoff) ####

# keep only visits that are not followed by another visit to the same
# questionnaire URL within 6 hours and whose `same_as_prev_url` flag is 0
repeated_YG_6h <- repeated_YG %>%
  filter(!same_6h, same_as_prev_url == 0)

## Person-URL level summary
# number of retained visits per person and questionnaire URL;
# `repeated` is 1 if there is more than one, otherwise 0
repeated_YG_6h_url_people <- repeated_YG_6h %>%
  group_by(dataset, person_id, url_quest) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(repeated = as.numeric(count > 1))

## Person-level summary
# One row per person: how many questionnaire URLs were visited, how many of
# them repeatedly, whether any was repeated, and the share repeated.
# `.groups = "drop"` returns an ungrouped table; a leftover grouping by
# `dataset` would make a later `select(-dataset)` silently keep that column.
repeated_YG_6h_people <- repeated_YG_6h_url_people %>%
  group_by(dataset, person_id) %>%
  summarise(count_urls_6h = n(), # number of surveys taken
            count_repeated_urls_6h = sum(repeated), # number of repeated surveys
            .groups = "drop") %>%
  mutate(any_repeated_6h = as.numeric(count_repeated_urls_6h > 0),
         prop_repeated_6h = count_repeated_urls_6h / count_urls_6h)

## person-level summary (24-hour cutoff) ####

# keep only visits that are not followed by another visit to the same
# questionnaire URL within 24 hours and whose `same_as_prev_url` flag is 0
repeated_YG_24h <- repeated_YG %>%
  filter(!same_24h, same_as_prev_url == 0)

## Person-URL level summary
# number of retained visits per person and questionnaire URL;
# `repeated` is 1 if there is more than one, otherwise 0
repeated_YG_24h_url_people <- repeated_YG_24h %>%
  group_by(dataset, person_id, url_quest) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(repeated = as.numeric(count > 1))

## Person-level summary
# One row per person: how many questionnaire URLs were visited, how many of
# them repeatedly, whether any was repeated, and the share repeated.
# `.groups = "drop"` returns an ungrouped table; a leftover grouping by
# `dataset` would make a later `select(-dataset)` silently keep that column.
repeated_YG_24h_people <- repeated_YG_24h_url_people %>%
  group_by(dataset, person_id) %>%
  summarise(count_urls_24h = n(), # number of surveys taken
            count_repeated_urls_24h = sum(repeated), # number of repeated surveys
            .groups = "drop") %>%
  mutate(any_repeated_24h = as.numeric(count_repeated_urls_24h > 0),
         prop_repeated_24h = count_repeated_urls_24h / count_urls_24h)

## join datasets ####
# One row per person with the 1h, 6h and 24h summaries side by side.
# The right-hand tables are ungrouped before `dataset` is dropped: on a table
# still grouped by `dataset`, select() would silently re-add the column and the
# join would then implicitly match on it as well. The join key is stated
# explicitly instead of relying on the set of shared column names.
# `dataset` is taken from the 1h table; the 6h/24h tables are built from
# subsets of the rows behind the 1h table, so they add no new persons.
people_repeated_YG <- repeated_YG_1h_people %>%
  full_join(repeated_YG_6h_people %>% ungroup() %>% select(-dataset),
            by = "person_id") %>%
  full_join(repeated_YG_24h_people %>% ungroup() %>% select(-dataset),
            by = "person_id")

write_csv(people_repeated_YG, "data/browsing_summarized/people_repeated_YG.csv")

# DISAGGREGATION BY PLATFORM ===================================================

# Note: the following technically needs to be run after weights are created (2-recode-weight-surveys.R)
# However, since it is based on data not aggregated enough to be published,
# we do not include it in the published pipeline and report it here.

# platform-person-URL level summary (1-hour cutoff): number of retained visits
# per platform, person and questionnaire URL; `repeated` is 1 if there is more
# than one, otherwise 0
repeated_YG_1h_url_people_platform <- repeated_YG_1h %>%
  group_by(dataset, platform, person_id, url_quest) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(repeated = as.numeric(count > 1))

# platform-person level summary: per platform and person, how many
# questionnaire URLs were visited, how many of them repeatedly, whether any
# was repeated, and the share repeated.
# `.groups = "drop"` returns an ungrouped table so that no grouping by
# `dataset`/`platform` leaks into the later join and survey-design steps.
repeated_YG_1h_people_platform <- repeated_YG_1h_url_people_platform %>%
  group_by(dataset, platform, person_id) %>%
  summarise(count_urls = n(), # number of surveys taken
            count_repeated_urls = sum(repeated), # number of repeated surveys
            .groups = "drop") %>%
  mutate(any_repeated = as.numeric(count_repeated_urls > 0),
         prop_repeated = count_repeated_urls / count_urls)

# survey weights and number of active days per person; `person_id` is cast to
# character so it can be matched against the browsing summaries
weights_YG <- read.csv("data/analysis_YG.csv") %>%
  select(all_of(c("person_id", "weight", "n_days_active"))) %>%
  mutate(across(person_id, as.character))

# Attach weights, restrict to people with at least 7 active days, and turn the
# table into a weighted survey design.
repeated_YG_1h_people_platform <- repeated_YG_1h_people_platform %>%
  full_join(weights_YG, by = "person_id") %>%
  # fill in 0 for those people not in the rep data
  mutate(across(
    c(count_urls, count_repeated_urls, any_repeated, prop_repeated),
    function(x) ifelse(is.na(x), 0, x)
  )) %>%
  # rows with missing n_days_active are dropped here as well
  filter(n_days_active >= 7) %>%
  # people without a weight get weight 1
  mutate(weight = ifelse(is.na(weight), 1, weight)) %>%
  as_survey_design(weights = weight)

# Weighted platform-level summary (1-hour cutoff), written to CSV.
# Rows without a platform label are dropped before aggregating.
repeated_YG_1h_platform <- repeated_YG_1h_people_platform %>%
  filter(!is.na(platform)) %>%
  group_by(platform) %>%
  # weighted means per platform, rounded to 2 decimals; the first and third
  # are scaled by 100 to express them as percentages
  summarise(any_repeated_perc = round(survey_mean(any_repeated) * 100, 2),
            count_repeated_urls_mean = round(survey_mean(count_repeated_urls), 2),
            perc_repeated_mean = round(survey_mean(prop_repeated) * 100, 2)) %>%
  # drop the columns whose names contain "_se"
  # (presumably the standard errors added by survey_mean -- confirm)
  select(-contains("_se")) 

write_csv(repeated_YG_1h_platform, "data/browsing_summarized/platforms_repeated_YG.csv")
